##### Additional analysis #####
#Remove objects
rm(list = ls())

#Load pkgs
library(tidyverse)
library(xtable)
library(fastDummies)
library(brglm)
library(sandwich)
library(lmtest)
library(margins)
library(texreg)
library(parallel)
library(readxl)

#Load 100 topics data#

#Load main dataset
df_merged_mean_100 <- read_delim("input/df_merged_mean_100.csv", 
                                    ";", escape_double = FALSE, col_types = cols(date_text = col_date(format = "%d/%m/%Y")), 
                                    trim_ws = TRUE)
labels100 <- read_excel("input/topics100_labeled.xlsx")

#Load labels


#Load helper functions
source("scripts/helper_functions.R")


#Check correlation between websites and topics

#Create Dummies for each website
df_merged_mean_100 <- fastDummies::dummy_cols(df_merged_mean_100, select_columns =
                                            c("website_id"), remove_first_dummy=F)


#First correlation matrix
corr <- round(cor(df_merged_mean_100[,c(6,19:137)],use="pairwise"), 1)
head(corr[, 1:4])

#Exclude topics that correlate strongly (>=0.6) to specific websites
index <- which(abs(corr) > .5 & abs(corr) < 1, arr.ind = T)


exclude <- unique(row.names(index))
names_fe_100 <- setdiff(names(df_merged_mean_100[,c(19:118)]),exclude)


#Create Dummies for week and day for robust
df_merged_mean_100 <- df_merged_mean_100 %>%
  group_by(date_text) %>%
  mutate(date_pos_rob=  ifelse(var(off_rob)>0,date_text,0)) %>%
  ungroup()


df_merged_mean_100 <- df_merged_mean_100 %>%
  group_by(website_id) %>%
  mutate(off_l= dplyr::lag(off,order_by=website_id)) %>%
  mutate(off_999_rob_l= dplyr::lag(off_999_rob,order_by=website_id)) %>%
  mutate(off_500_rob_l= dplyr::lag(off_500_rob,order_by=website_id)) %>%
  mutate(off_rob_l= dplyr::lag(off_rob,order_by=website_id)) %>%
  ungroup()

#Define i as character value
i = "Accidents_deaths"


run_model_100_strong <- function(i){
  e1 <- myTryCatch(model <- brglm(off_rob~scale(get(i))+factor(website_id)+off_rob_l+
                                    factor(date_pos_rob),data=df_merged_mean_100,
                                  family=binomial(link="logit"),method="brglm.fit",pl=T))
  vcov <- vcovCL(model, cluster = ~ website_id,type="HC1")
  rob <- coeftest(model ,vcov = vcov)
  se <- rob[,2]
  p_value <- rob[,4]
  sign <- ifelse(p_value[2]<0.05/281,1,0)
  pos <- ifelse(rob[2,1]>0,1,0)
  error <- ifelse(e1[[2]][1]=="Iteration limit reached",1,0)
  return(list(model,se,p_value,sign,pos,error))
}

models_100 <- sapply(names_fe_100, run_model_100_strong)

#Print convergence status of models

converge <- unlist(models_100[6,])

#Display models with iteration limit reached
limit_reached <- str_remove(names(converge[converge==1]),".message")


names_fe_100 <- as.data.frame(names_fe_100)
colnames(names_fe_100) <- "label_output"

names_fe_100 <- left_join(names_fe_100,dplyr::select(labels100,label_output,label_name),by="label_output")

#Report models
htmlreg(models_100[1,1:86],override.se = models_100[2,1:86],override.pvalues = models_100[3,1:86],
       custom.model.names=names_fe_100$label_name[1:86],
       custom.coef.map = list('scale(get(i))'="Topic",
                              'off_rob_l'="DoS attack (t-1)"),
       stars = c(0.0001779359, 0.05),
      file = "output/tables/Table_D5_topics_100.html",
      caption = paste0("Iteration limit reached for: ",paste(limit_reached,collapse=", "),
                       ". Results for these models are manually removed in the formatted table.")
      ) #Include information on topics with iteration limit

#Create variables vector for simulation
pos_names_100 <- names(unlist(models_100[5,][(models_100[5,]==1 & models_100[4,]==1)
]))

#Leave out models with iteration limit
pos_names_100 <- setdiff(pos_names_100,limit_reached)

marginal_model_sim_100 <- function(variable_name){
  library(brglm)
  library(margins)
  library(sandwich)
  library(tidyverse)
  set.seed(123)
  formula <- as.formula(paste0("off_rob~",paste0(variable_name,"+off_rob_l+factor(website_id)+factor(date_pos_rob)")))
  m <-  brglm(formula,data=df_merged_mean_100,
              family=binomial(link="logit"),method="brglm.fit",pl=T)
  margins <- margins(m,vcov=vcovCL(m,cluster = ~ website_id,type="HC1"),vce="simulation", iterations=1000,
                     change="minmax",data=df_merged_mean_100)
  return(list(margins))
}

run_marginal_models_100 <- function(names){
  # Run marignal models
  start.time <- Sys.time()
  
  # Calculate the number of cores
  no_cores <- detectCores(all.tests = FALSE, logical = TRUE)
  no_cores <- no_cores-1
  
  # Initiate cluster
  cl <- makeCluster(no_cores)
  
  clusterExport(cl, list("df_merged_mean_100","marginal_model_sim_100"))
  
  marginal_models <- parLapply(cl, 
                               c(names), 
                               marginal_model_sim_100)
  
  stopCluster(cl)
  end.time <- Sys.time()
  return(marginal_models)
}

set.seed(123)
marginal_models_pos_100 <- run_marginal_models_100(pos_names_100) 


names_fe <- pos_names_100
names_output_100 <- as.data.frame(names_fe)
names_output_100$topic_names_text_fe <- c("Sport","Healthcare","Government ministers","National assembly",
                                          "Puigdemont/Mugabe purge (mixed)","Election (general)",
                                          "Bonus payments","Licenses","Regional organizations",
                                          "Crime/deaths","Opposition Frente Amplio","Election (procedures)",
                                          "Oscar Perez/repression","CLAP","Company ","Exiled opposition",
                                          "Exchange rate","Caracas metro",
                                          "Mendoza","Russia-UK/diplomat expulsion","Protest/riots","Russia","Mining",
                                          "Cuba","Lima summit","Opposition campaigns/holidays (mixed)",
                                          "Police","Political prisoners")
names_output_100$category <- c("Nonpolitical","Social/economic crisis","Domestic politics","Political legitimacy","International politics","Political legitimacy",
                               "Social/economic crisis","Social/economic crisis","Political legitimacy","Nonpolitical",
                               "Political legitimacy","Political legitimacy","Protest/repression","Social/economic crisis","Social/economic crisis",
                               "Political legitimacy","Social/economic crisis","Social/economic crisis",
                               "Political legitimacy","International politics","Protest/repression","International politics",
                               "Social/economic crisis","International politics","Political legitimacy","Political legitimacy","Protest/repression",
                               "Protest/repression")


pos_100 <-plot_simulations(marginal_models_pos_100,
                           pos_names_100,names_output_100)
pos_100

ggsave("output/figures/Figure_D7_topics100.pdf",width=16, height=8)